from selenium import webdriver
import time
from bs4 import BeautifulSoup
import requests


class Crawler():
    '''
    Crawl free-proxy listing sites to feed the proxy pool.

    Each ``crawl_*`` method is a generator that yields proxies as
    ``"ip:port"`` strings.
    '''
    def crawl_66ip(self, page_count=1000):
        '''
        Fetch free proxy IPs from 66ip (http://www.66ip.cn/10.html).

        Drives a real Chrome instance via selenium because the listing
        pages are rendered client-side.

        :param page_count: number of listing pages to crawl (1..page_count)
        :return: generator of proxies, format ``ip:port``
        '''
        browser = webdriver.Chrome()
        try:
            for page in range(1, page_count + 1):
                url = 'http://www.66ip.cn/' + str(page) + '.html'
                print('正在获取66免费代理网代理ip数据')
                browser.get(url)
                time.sleep(2)  # give the page time to render before scraping
                soup = BeautifulSoup(browser.page_source, 'lxml')
                for container in soup.find_all(name='div', class_='container'):
                    rows = container.find_all(name='tr')
                    # rows[0] is the table header; data rows follow.
                    for row in rows[1:]:
                        cells = [td.text.strip() for td in row.find_all(name='td')]
                        # First two cells are IP and port — TODO(review):
                        # confirm against the live page layout.
                        if len(cells) >= 2 and cells[0]:
                            yield cells[0] + ':' + cells[1]
        finally:
            browser.quit()  # always release the Chrome process, even on error


    def crawl_xici(self, page_count=1000):
        '''
        Fetch free proxy IPs from xicidaili (https://www.xicidaili.com/nn/).

        :param page_count: number of listing pages to crawl (1..page_count)
        :return: generator of proxies, format ``ip:port``
        '''
        headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36',
            'Connection': 'close'
        }
        try:
            for page in range(1, page_count + 1):
                url = 'https://www.xicidaili.com/nn/' + str(page)
                time.sleep(3)  # throttle requests to avoid being banned
                print('正在获取西刺代理网代理ip数据')
                # timeout prevents a stalled server from hanging the generator
                response = requests.get(url=url, headers=headers, timeout=10)
                if response.status_code != 200:
                    continue
                soup = BeautifulSoup(response.text, 'lxml')
                for table in soup.find_all(name='table', id='ip_list'):
                    rows = table.find_all(name='tr')
                    # rows[0] is the header row; skip it.
                    for row in rows[1:]:
                        # Non-empty whitespace-separated fragments of the row
                        # text; expected order is [ip, port, ...] —
                        # TODO(review): confirm page layout.
                        parts = row.text.split()
                        if len(parts) >= 2:
                            yield parts[0] + ':' + parts[1]
        except requests.ConnectionError as e:
            print('Error', e.args)

# if __name__ == '__main__':
#     c = Crawler()
#     c.crawl_66ip()